# Reproducibility: fix the RNG seed before any sampling-related code runs.
set.seed(12345)
# Load and preprocess the raw data -- presumably defines `data`, `cachefile()`,
# q95()/q99() and the ITERATIONS/CHAINS/CORES/... constants used below; confirm
# against ingest_data.R.
source("ingest_data.R")
# Cache key for the fitted model files; the name encodes the model structure
# (population A + team + team:repo varying effects, zi part, sd 0.25 priors,
# gamma(0.1, 0.1) shape prior).
MODEL_CACHE <- "added-M_pop_A_team_team_repo_RDC_zi_sd025_gamma01"
Models are stored in the following directory, which must exist prior to knitting this document:
# Print the absolute model-cache directory; normalizePath(mustWork = TRUE)
# fails early if it does not exist, as required before knitting.
# file.path() joins path components portably; TRUE instead of the reassignable
# shorthand T.
cat(normalizePath(file.path(getwd(), dirname(cachefile(MODEL_CACHE))), mustWork = TRUE))
## /home/app/.cache
The used cache directory can be controlled via the cache parameter to Rmd - it can be useful to experiment with this parameter if you Knit the document manually in RStudio.
We can experiment with a narrower shape prior by configuring it to (for instance) gamma(0.1, 0.1), and running the full model based on this prior. The conclusions and tendencies should be the same, or similar, but the choice of prior might affect how fast the model converges, and if there are diagnostic problems (e.g. divergent transitions).
# Working data set for the model: response y (introduced duplicates), the
# scaled predictors A, C, D, R, and the two grouping factors.
d <- data |>
  select(
    y = INTROD,           # response: duplicates introduced by a change
    A, C, D, R,           # predictors, kept under their existing names
    team = committerteam, # committing team (grouping factor)
    repo = repo           # repository (nested within team below)
  )
# Zero-inflated negative-binomial model: both the count part (y) and the
# zero-inflation part (zi) get population-level effects of A, R, C, D plus
# correlated varying intercepts/slopes per team and per repo-within-team.
# NOTE(review): `formula` shadows stats::formula() here; harmless in this
# script, but a more specific name would avoid confusion.
formula <- bf(y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo),
zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo))
# List every parameter class that needs a prior (and the brms defaults) for
# this formula/family/data combination.
get_prior(data=d,
family=zero_inflated_negbinomial,
formula=formula)
## prior class coef group resp dpar nlpar lb ub
## (flat) b
## (flat) b A
## (flat) b C
## (flat) b D
## (flat) b R
## lkj(1) cor
## lkj(1) cor team
## lkj(1) cor team:repo
## student_t(3, -2.3, 2.5) Intercept
## student_t(3, 0, 2.5) sd 0
## student_t(3, 0, 2.5) sd team 0
## student_t(3, 0, 2.5) sd C team 0
## student_t(3, 0, 2.5) sd D team 0
## student_t(3, 0, 2.5) sd Intercept team 0
## student_t(3, 0, 2.5) sd R team 0
## student_t(3, 0, 2.5) sd team:repo 0
## student_t(3, 0, 2.5) sd C team:repo 0
## student_t(3, 0, 2.5) sd D team:repo 0
## student_t(3, 0, 2.5) sd Intercept team:repo 0
## student_t(3, 0, 2.5) sd R team:repo 0
## gamma(0.01, 0.01) shape 0
## (flat) b zi
## (flat) b A zi
## (flat) b C zi
## (flat) b D zi
## (flat) b R zi
## logistic(0, 1) Intercept zi
## student_t(3, 0, 2.5) sd zi 0
## student_t(3, 0, 2.5) sd team zi 0
## student_t(3, 0, 2.5) sd C team zi 0
## student_t(3, 0, 2.5) sd D team zi 0
## student_t(3, 0, 2.5) sd Intercept team zi 0
## student_t(3, 0, 2.5) sd R team zi 0
## student_t(3, 0, 2.5) sd team:repo zi 0
## student_t(3, 0, 2.5) sd C team:repo zi 0
## student_t(3, 0, 2.5) sd D team:repo zi 0
## student_t(3, 0, 2.5) sd Intercept team:repo zi 0
## student_t(3, 0, 2.5) sd R team:repo zi 0
## source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
# Custom priors replacing the flat/default ones listed by get_prior() above.
# Regularizing normals on intercepts and betas, weibull(2, 0.25) on all group
# sd's, lkj(2) on correlations, and -- the change this document studies --
# gamma(0.1, 0.1) instead of gamma(0.01, 0.01) on the NB shape.
priors <- c(prior(normal(0, 0.5), class = Intercept),
prior(normal(0, 0.25), class = b),
prior(weibull(2, 0.25), class = sd),
prior(weibull(2, 0.25), class = sd, group=team:repo),
prior(lkj(2), class = cor),
prior(normal(0, 0.5), class = Intercept, dpar=zi),
prior(normal(0, 0.25), class = b, dpar=zi),
prior(weibull(2, 0.25), class = sd, dpar=zi),
prior(weibull(2, 0.25), class = sd, group=team:repo, dpar=zi),
prior(gamma(0.1, 0.1), class = shape))
# Check that every prior matches a parameter; the outer parentheses force the
# returned prior table to print.
(v <- validate_prior(prior=priors,
formula=formula,
data=d,
family=zero_inflated_negbinomial)
)
## prior class coef group resp dpar nlpar lb ub
## normal(0, 0.25) b
## normal(0, 0.25) b A
## normal(0, 0.25) b C
## normal(0, 0.25) b D
## normal(0, 0.25) b R
## normal(0, 0.25) b zi
## normal(0, 0.25) b A zi
## normal(0, 0.25) b C zi
## normal(0, 0.25) b D zi
## normal(0, 0.25) b R zi
## normal(0, 0.5) Intercept
## normal(0, 0.5) Intercept zi
## lkj_corr_cholesky(2) L
## lkj_corr_cholesky(2) L team
## lkj_corr_cholesky(2) L team:repo
## weibull(2, 0.25) sd 0
## weibull(2, 0.25) sd zi 0
## weibull(2, 0.25) sd team 0
## weibull(2, 0.25) sd C team 0
## weibull(2, 0.25) sd D team 0
## weibull(2, 0.25) sd Intercept team 0
## weibull(2, 0.25) sd R team 0
## weibull(2, 0.25) sd team zi 0
## weibull(2, 0.25) sd C team zi 0
## weibull(2, 0.25) sd D team zi 0
## weibull(2, 0.25) sd Intercept team zi 0
## weibull(2, 0.25) sd R team zi 0
## weibull(2, 0.25) sd team:repo 0
## weibull(2, 0.25) sd C team:repo 0
## weibull(2, 0.25) sd D team:repo 0
## weibull(2, 0.25) sd Intercept team:repo 0
## weibull(2, 0.25) sd R team:repo 0
## weibull(2, 0.25) sd team:repo zi 0
## weibull(2, 0.25) sd C team:repo zi 0
## weibull(2, 0.25) sd D team:repo zi 0
## weibull(2, 0.25) sd Intercept team:repo zi 0
## weibull(2, 0.25) sd R team:repo zi 0
## gamma(0.1, 0.1) shape 0
## source
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## user
## user
## (vectorized)
## (vectorized)
## user
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
The same structure as in \(\mathcal{M}_4\), but a single prior is changed to gamma(0.1, 0.1).
# Prior-predictive model: sample_prior = "only" means the data are ignored and
# draws come from the priors alone. Cached under a "ppc-" prefixed file;
# file_refit = "on_change" refits only if formula/priors/data change.
M_ppc <- brm(data = d,
family = zero_inflated_negbinomial,
formula = formula,
prior = priors,
file = cachefile(paste0("ppc-",MODEL_CACHE)),
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
sample_prior = "only",
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
m <- M_ppc
# Prior-predictive draws for every row of d.
yrep <- posterior_predict(m, newdata = d) # or nd
# Does the prior allow a plausible proportion of zero-duplicate changes?
ppc_stat(y = d$y, yrep, stat = function(y) mean(y == 0)) + ggtitle("Prior predicted proportion of zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We see that the changed shape prior causes the model to be more skeptical of seeing duplicates—the most likely proportion of zeros in predicted data is now 1.0. But we also see that the model does allow between about 0.5–0.95 proportion of zeros, so the model does not rule out that duplicates appear.
# Prior-predictive distribution of the sample maximum; keep the plot object
# for rescaled re-display below. Auto-printing the name shows the plot.
sim_max <- ppc_stat(y = d$y, yrep, stat = "max") + ggtitle("Prior predicted max values")
sim_max
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Scaling to more reasonable values
# Zoom the max-value plot to the 0-1000 range to make the bulk visible.
sim_max + scale_x_continuous(limits = c(0,1000)) + ggtitle("Prior predicted max values up to 1000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The maximum predicted value is more spread out, compared to \(\mathcal{M}_4\) (lower peak at 0, slightly higher distribution at 1000), but the overall distribution shape looks similar.
# Prior-predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Prior predicted Q99 vs. observed value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
At the 99th percentile, we again see that this model is more skeptical of seeing duplicates. The most likely value includes zero.
# Joint (Q95, Q99) prior-predictive check; legend moved below the panel.
p <- ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + theme(legend.position="bottom") #+ ggtitle("Prior predicted Q95 vs Q99")
p
The 2-d plot confirms the conclusion—95th percentile now ranges to about 20, whereas \(\mathcal{M}_4\) ranged to about 30.
The standard deviation of the predictions is a bit harder to grasp intuitively, but our prior information encompasses the observed value well.
# Prior-predictive standard deviation vs. the observed one; keep the plot
# object so it can be re-displayed zoomed-in below.
p <- ppc_stat(y = d$y, yrep, stat = "sd") + ggtitle("Prior predicted stddev vs. observed value")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Zooming in to show the distribution in more detail.
# Zoom to 0-30 to see the shape of the stddev distribution.
p + scale_x_continuous(limits=c(0,30))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Lower peak at zero, thicker tail around 20–30.
# Prior-predictive Q99 split per team.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Prior-predictive Q99 split per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per repository")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The actual posterior fit: same formula/priors as the prior-predictive model
# above, but now conditioned on the data (no sample_prior = "only").
# Cached under MODEL_CACHE; refit only when formula/priors/data change.
M_pop_A_team_team_repo_zi_sd025_model <-
brm(data = d,
family = zero_inflated_negbinomial,
file = cachefile(MODEL_CACHE),
formula = formula,
prior = priors,
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
# Attach PSIS-LOO so loo-based model comparison is available later.
M_pop_A_team_team_repo_zi_sd025_model <- add_criterion(M_pop_A_team_team_repo_zi_sd025_model, "loo")
# Short alias used throughout the rest of the document.
m <- M_pop_A_team_team_repo_zi_sd025_model
# Trace plots for all parameters, 12 per page.
# The parameter names are chunked with split() instead of manual start/end
# index arithmetic: split() never pads the final (possibly shorter) chunk, so
# no na.omit() is needed, and it degrades gracefully to an empty list when
# there are no parameters (seq(1, to=0, by=12) would error). The numeric
# grouping factor sorts numerically, so chunk order is preserved even beyond
# 9 pages; unname() keeps the printed result a plain unnamed list.
p <- mcmc_trace(m)
pars <- levels(p[["data"]][["parameter"]])
par_chunks <- unname(split(pars, ceiling(seq_along(pars) / 12)))
lapply(par_chunks, function(chunk) mcmc_trace(m, pars = chunk))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
The “caterpillar plots” show that the chains mixed well also for this model.
# Convergence diagnostics: R-hat per parameter and as a histogram.
mcmc_plot(m, type="rhat")
mcmc_plot(m, type="rhat_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Sampling efficiency: effective-sample-size ratios, per parameter and pooled.
mcmc_plot(m, type="neff")
mcmc_plot(m, type="neff_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Just as for \(\mathcal{M}_4\), the MCMC chains, \(\hat{R}\) and \(N_{eff}\) ratio look good.
# PSIS-LOO cross-validation estimate for the fitted model.
# NOTE(review): the variable `loo` shadows loo::loo(); it is reused by the
# reloo block below, so the name is kept.
loo <- loo(m)
loo
##
## Computed from 12000 by 31007 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -8486.3 163.1
## p_loo 258.6 16.5
## looic 16972.5 326.2
## ------
## MCSE of elpd_loo is NA.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.1, 1.6]).
##
## Pareto k diagnostic values:
## Count Pct. Min. ESS
## (-Inf, 0.7] (good) 30994 100.0% 674
## (0.7, 1] (bad) 10 0.0% <NA>
## (1, Inf) (very bad) 3 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
# Pareto-k diagnostic plot; points above 0.7 are unreliable importance weights.
plot(loo)
There are 13 outlier points that need to be analysed (e.g. with reloo): 10 with Pareto k above 0.7, and 3 above 1.0.
Reloo is disabled by default, but can be enabled by setting the RELOO environment variable (which in turn sets the reloo parameter of this document).
# Exact leave-one-out refits (reloo) for the high-Pareto-k points, cached to
# disk because the refits take over a day. Inside a braced block only the
# block's final value is auto-printed, so the original bare Sys.time() calls
# were silently discarded; print() makes the start/end timestamps visible.
reloofile <- cachefile(paste0("reloo-", MODEL_CACHE, ".rds"))
if (file.exists(reloofile)) {
  (reloo <- readRDS(reloofile))
} else {
  print(Sys.time())  # start time, to gauge how long the refits take
  (reloo <- reloo(m, loo, chains = CHAINS, cores = CORES))
  print(Sys.time())  # end time
  saveRDS(reloo, reloofile)  # cache so later knits skip the day-long refits
}
##
## Computed from 12000 by 31007 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -8492.7 163.8
## p_loo 265.0 18.9
## looic 16985.4 327.6
## ------
## MCSE of elpd_loo is 0.7.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.1, 1.6]).
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
Note that the reloo results are cached, and included in the docker image by default. If you want to reproduce the reloo manually, set a separate cache dir (and mount it into the docker image), and enable the RELOO environment variable. It will take more than one day to complete all reloo, so it is best done over a weekend.
# plotting the recalculated values
plot(reloo)
# which points still have high pareto_k after reloo?
# NOTE(review): loo::pareto_k_ids() defaults to threshold = 0.7 in current loo
# releases (older versions used 0.5) -- confirm which threshold is intended,
# or pass it explicitly.
influencers <- data[loo::pareto_k_ids(reloo),]
influencers |> group_by(repo,committerteam) |> tally()
## # A tibble: 0 × 3
## # Groups: repo [0]
## # ℹ 3 variables: repo <fct>, committerteam <fct>, n <int>
According to the reloo function, all Pareto-k values are below 0.7, so we should be able to trust the model inferences.
# Posterior (not prior) predictive draws; reuses the training data implicitly.
yrep <- posterior_predict(m)
# Posterior predictive proportion of zeros vs. the observed proportion.
ppc_stat(y = d$y, yrep, stat = function(y) mean(y == 0))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of zeros is marginally higher than the observed value, but it still encompasses the observed proportion.
# Posterior-predictive distribution of the sample maximum; keep the plot
# object for the rescaled view below. Auto-printing the name shows the plot.
sim_max <- ppc_stat(y = d$y, yrep, stat = "max")
sim_max
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Zoom the posterior max-value plot to 0-1000.
sim_max + scale_x_continuous(limits = c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of predicted max values, after training, for this model, is similar to the observed max value, and very similar to \(\mathcal{M}_4\).
# Posterior-predictive standard deviation vs. the observed value.
ppc_stat(y = d$y, yrep, stat = "sd")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comparing the x-axis scale, we see that this model is expecting slightly lower standard deviation (range goes to 30 instead of 40, as for \(\mathcal{M}_4\)).
# Posterior-predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Posterior predicted Q99")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Joint (Q95, Q99) posterior-predictive check.
ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + ggtitle("Posterior predicted Q95 vs Q99")
Identical to \(\mathcal{M}_4\).
# Posterior-predictive max split per team.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$team) + ggtitle("Posterior predictive max observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Compared to \(\mathcal{M}_4\), this model is more likely to predict lower number of duplicates. So the team predicted max values will, for the most part, be lower than for \(\mathcal{M}_4\).
# Posterior-predictive max split per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$repo) + ggtitle("Posterior predictive max observation per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Posterior max per repository have some variation, in particular in Saturn and Jupiter.
# Posterior-predictive Q99 split per team.
# NOTE(review): the title says "quartile" but q99 is the 99th percentile.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + ggtitle("Posterior predictive 99% quartile per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The 99th percentile value predictions seem very well fitted. Predictions surround the observation nicely.
# Posterior-predictive Q99 split per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + ggtitle("Posterior predictive 99% quartile per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Suspended rootogram of observed vs. predicted counts; keep the plot object
# so it can be re-displayed on a narrower x-range below.
rootogram <- pp_check(m, type = "rootogram", style="suspended")
rootogram
## Using all posterior draws for ppc type 'rootogram' by default.
We need to size the rootogram to see anything of value.
# Restrict to the 0-50 count range, where nearly all the mass lies.
rootogram + scale_x_continuous(limits=c(0,50)) + ggtitle("Suspended rootogram, scaled to 0-50 prediction interval")
## Warning: Removed 5151 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 5149 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
The posterior predictions in general work well, though the single outlier in Saturn is not picked up (as most other changes there yield very few duplicates). While our priors do allow the outliers to shine through, they relegate them as very unlikely (most max predictions are in the 10s or 20s).
# Helpers for team/repo posterior-prediction heatmaps (defines
# posterior_predict_by_team_and_repo() and heatmap_by_team_and_repo() --
# presumably; confirm against predict_utils.R).
source("predict_utils.R")
# Probability of introducing at least one duplicate, at various predictor
# settings (quantiles of added/removed lines, complexity, duplicates).
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates = mean(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = q99(data$COMPLEX), duplicates = q99(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = median(data$COMPLEX), duplicates = median(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity = quantile(data$COMPLEX, 0.25), duplicates = quantile(data$DUP, 0.25), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Same probability summary with complexity/duplicates left at their defaults.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Upper-tail summaries of the predicted duplicate counts per team/repo cell.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q95(x)), "Quantile 95%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q99(x)), "Quantile 99%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity=q95(data$COMPLEX), duplicates = q95(data$DUP), summary=function(x) q99(x)), "Quantile 99%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity=q99(data$COMPLEX), duplicates = q99(data$DUP), summary=function(x) quantile(x, 0.75)), "Quantile 75%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=quantile(data$ADD, 0.99), removed=quantile(data$DEL, 0.1), complexity=mean(data$COMPLEX), duplicates = mean(data$DUP), summary=function(x) median(x)), "Median a posteriori", decimals=0)
The various heatmaps all look similar to the findings from \(\mathcal{M}_4\).
# Full parameter summary of the fitted model (group-level sd's/correlations,
# population-level effects, shape).
summary(m)
## Family: zero_inflated_negbinomial
## Links: mu = log; shape = identity; zi = logit
## Formula: y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
## zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
## Data: d (Number of observations: 31007)
## Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
## total post-warmup draws = 12000
##
## Group-Level Effects:
## ~team (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept) 0.26 0.11 0.06 0.49 1.00 4307
## sd(R) 0.17 0.08 0.04 0.36 1.00 5000
## sd(C) 0.21 0.09 0.05 0.41 1.00 5415
## sd(D) 0.11 0.07 0.02 0.27 1.00 6907
## sd(zi_Intercept) 0.15 0.08 0.03 0.34 1.00 6569
## sd(zi_R) 0.26 0.11 0.06 0.49 1.00 4110
## sd(zi_C) 0.17 0.09 0.03 0.39 1.00 6185
## sd(zi_D) 0.16 0.09 0.03 0.35 1.00 6614
## cor(Intercept,R) 0.14 0.36 -0.59 0.77 1.00 7315
## cor(Intercept,C) 0.06 0.36 -0.64 0.71 1.00 8828
## cor(R,C) 0.11 0.36 -0.62 0.75 1.00 6983
## cor(Intercept,D) 0.14 0.38 -0.62 0.79 1.00 10758
## cor(R,D) -0.02 0.38 -0.71 0.69 1.00 10901
## cor(C,D) 0.01 0.38 -0.70 0.71 1.00 9948
## cor(zi_Intercept,zi_R) 0.00 0.38 -0.70 0.71 1.00 5113
## cor(zi_Intercept,zi_C) -0.11 0.38 -0.76 0.65 1.00 10045
## cor(zi_R,zi_C) 0.09 0.38 -0.65 0.76 1.00 9622
## cor(zi_Intercept,zi_D) -0.00 0.38 -0.71 0.71 1.00 10231
## cor(zi_R,zi_D) -0.14 0.38 -0.79 0.62 1.00 10098
## cor(zi_C,zi_D) -0.08 0.38 -0.76 0.66 1.00 9542
## Tail_ESS
## sd(Intercept) 5394
## sd(R) 6091
## sd(C) 5124
## sd(D) 7278
## sd(zi_Intercept) 6996
## sd(zi_R) 4407
## sd(zi_C) 6619
## sd(zi_D) 7077
## cor(Intercept,R) 8565
## cor(Intercept,C) 7845
## cor(R,C) 7848
## cor(Intercept,D) 9931
## cor(R,D) 9530
## cor(C,D) 9799
## cor(zi_Intercept,zi_R) 7809
## cor(zi_Intercept,zi_C) 8980
## cor(zi_R,zi_C) 9225
## cor(zi_Intercept,zi_D) 8277
## cor(zi_R,zi_D) 9137
## cor(zi_C,zi_D) 10056
##
## ~team:repo (Number of levels: 84)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept) 0.47 0.07 0.33 0.62 1.00 4705
## sd(R) 0.32 0.06 0.21 0.45 1.00 4724
## sd(C) 0.34 0.09 0.17 0.52 1.00 2131
## sd(D) 0.21 0.05 0.12 0.33 1.00 4860
## sd(zi_Intercept) 0.59 0.08 0.44 0.75 1.00 6019
## sd(zi_R) 0.35 0.08 0.21 0.52 1.00 3296
## sd(zi_C) 0.41 0.11 0.19 0.64 1.00 2260
## sd(zi_D) 0.37 0.08 0.22 0.54 1.00 4877
## cor(Intercept,R) -0.23 0.21 -0.62 0.19 1.00 4068
## cor(Intercept,C) -0.15 0.23 -0.56 0.33 1.00 4881
## cor(R,C) 0.25 0.24 -0.23 0.70 1.00 3738
## cor(Intercept,D) 0.01 0.25 -0.48 0.49 1.00 5685
## cor(R,D) -0.32 0.26 -0.76 0.25 1.00 5783
## cor(C,D) -0.28 0.29 -0.79 0.31 1.00 3871
## cor(zi_Intercept,zi_R) -0.35 0.20 -0.73 0.07 1.00 4605
## cor(zi_Intercept,zi_C) -0.47 0.20 -0.81 -0.02 1.00 4811
## cor(zi_R,zi_C) 0.17 0.29 -0.40 0.72 1.00 3166
## cor(zi_Intercept,zi_D) 0.07 0.22 -0.38 0.49 1.00 5450
## cor(zi_R,zi_D) 0.15 0.26 -0.36 0.65 1.00 4529
## cor(zi_C,zi_D) 0.01 0.30 -0.53 0.60 1.00 2627
## Tail_ESS
## sd(Intercept) 7535
## sd(R) 7094
## sd(C) 5005
## sd(D) 6323
## sd(zi_Intercept) 8211
## sd(zi_R) 4887
## sd(zi_C) 4044
## sd(zi_D) 7478
## cor(Intercept,R) 6362
## cor(Intercept,C) 7490
## cor(R,C) 6218
## cor(Intercept,D) 7932
## cor(R,D) 7973
## cor(C,D) 7155
## cor(zi_Intercept,zi_R) 6227
## cor(zi_Intercept,zi_C) 7206
## cor(zi_R,zi_C) 4899
## cor(zi_Intercept,zi_D) 7782
## cor(zi_R,zi_D) 6594
## cor(zi_C,zi_D) 5085
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -1.51 0.15 -1.80 -1.21 1.00 6115 7022
## zi_Intercept 1.90 0.16 1.59 2.20 1.00 5561 7741
## A 1.07 0.05 0.98 1.16 1.00 7191 8158
## R 0.09 0.09 -0.09 0.28 1.00 6542 8155
## C 0.30 0.12 0.05 0.53 1.00 5223 6334
## D 0.29 0.08 0.12 0.45 1.00 6727 7953
## zi_A -1.16 0.06 -1.27 -1.05 1.00 8284 8699
## zi_R 0.43 0.13 0.16 0.65 1.00 6211 8056
## zi_C 0.33 0.14 0.05 0.59 1.00 4878 5347
## zi_D -0.34 0.11 -0.54 -0.13 1.00 6809 8007
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 1.04 0.09 0.88 1.22 1.00 9149 8957
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Helpers for conditional-effects plots (defines the condeffect_*/plot_*
# functions used below -- presumably; confirm against conditional_effects.R).
source("conditional_effects.R")
# Duplicates vs. complexity for selected team/repo combinations, at the 95th
# percentile of added/removed lines.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter")
(red_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Jupiter") + labs(title=NULL, subtitle=NULL, legend=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
(red_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + ylab(NULL) + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Side-by-side Red team panels with a shared legend.
# NOTE(review): common.legend = T uses the reassignable shorthand; TRUE is safer.
(red_plot <- ggarrange(red_jupiter_plot, red_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Same conditional-effects plots at the 99th percentile of added lines.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Green", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Green", "Jupiter")
(blue_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + theme(legend.position = "none") + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
(blue_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + ylab(NULL) + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Side-by-side Blue team panels with a shared legend.
(blue_plot <- ggarrange(blue_jupiter_plot, blue_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# 2x2 grid combining the Blue and Red panels, auto-labelled A-D.
(complexity_plot <- ggarrange(plotlist = list(blue_jupiter_plot, blue_uranus_plot, red_jupiter_plot, red_uranus_plot), labels="AUTO", nrow=2, ncol=2, common.legend = T, legend="bottom")
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Red team conditional effects across every remaining repository.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
(red_venus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Venus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Venus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Neptune", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Neptune") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Saturn", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Saturn") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "IntTest", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "IntTest") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mars", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mars") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mercury", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mercury") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
The conditional effects plot also all look similar to \(\mathcal{M}_4\).
# Observed duplicate-count distribution for Blue/Uranus, most frequent first.
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(desc(n))
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 968
## 2 1 44
## 3 2 23
## 4 3 5
## 5 4 4
## 6 5 2
## 7 17 1
# Observed duplicate-count distribution for Red/Jupiter, ordered by count value.
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 13 × 2
## INTROD n
## <int> <int>
## 1 0 2080
## 2 1 35
## 3 2 18
## 4 3 2
## 5 4 9
## 6 5 4
## 7 6 3
## 8 7 8
## 9 8 2
## 10 9 2
## 11 19 1
## 12 22 1
## 13 23 1
# Observed duplicate-count distribution for Red/Venus.
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 6 × 2
## INTROD n
## <int> <int>
## 1 0 214
## 2 1 5
## 3 2 7
## 4 5 2
## 5 6 1
## 6 11 2
# Observed duplicate-count distribution for Red/Uranus.
data |> filter(committerteam=="Red", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 5 × 2
## INTROD n
## <int> <int>
## 1 0 348
## 2 1 19
## 3 2 11
## 4 3 7
## 5 4 1
# Total number of Red/Venus observations.
data |> filter(committerteam=="Red", repo=="Venus") |> tally()
## n
## 1 231
# Complexity distribution for Red/Venus, binned into buckets of width 5.
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 32 × 2
## `trunc(COMPLEX/5)` n
## <dbl> <int>
## 1 0 40
## 2 1 18
## 3 2 19
## 4 3 26
## 5 4 19
## 6 5 16
## 7 6 8
## 8 7 2
## 9 8 5
## 10 9 7
## # ℹ 22 more rows
# Total number of Red/Jupiter observations.
data |> filter(committerteam=="Red", repo=="Jupiter") |> tally()
## n
## 1 2166
# Complexity distribution for Red/Jupiter, binned into buckets of width 5.
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 89 × 2
## `trunc(COMPLEX/5)` n
## <dbl> <int>
## 1 0 413
## 2 1 301
## 3 2 223
## 4 3 179
## 5 4 131
## 6 5 138
## 7 6 77
## 8 7 67
## 9 8 62
## 10 9 33
## # ℹ 79 more rows
# How many Red-team observations fall in each repository, largest first.
data |> filter(committerteam=="Red") |> group_by(repo) |> tally() |> arrange(desc(n))
## # A tibble: 8 × 2
## repo n
## <fct> <int>
## 1 Jupiter 2166
## 2 Saturn 1191
## 3 IntTest 617
## 4 Uranus 386
## 5 Venus 231
## 6 Neptune 215
## 7 Mercury 211
## 8 Mars 130
# Observed duplicate-count distribution for Blue/Venus.
data |> filter(committerteam=="Blue", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 654
## 2 1 8
## 3 2 4
## 4 3 1
## 5 4 1
## 6 5 1
## 7 8 1
# Blue/Uranus distribution again, ordered by count value (same data as above).
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 968
## 2 1 44
## 3 2 23
## 4 3 5
## 5 4 4
## 6 5 2
## 7 17 1
# Inspect the raw rows with the maximum observed complexity.
data |> filter(COMPLEX==max(COMPLEX))
## repo commit file ISTEST ISNEW ISREMOVED
## 1 Jupiter 52efadf06bb61941887114298aa281c277e1f8a0 910 FALSE FALSE FALSE
## 2 Jupiter 6de4ea0ed3207a898be965f3901efc5f0339bc0e 910 FALSE FALSE FALSE
## author authorteam
## 1 8ff0b36b3305550dc55c9097f3b970c599959326 Yellow
## 2 230d4e806215564296e126c9b532f1936adaaa89 Red
## committer committerteam churn addedComplexity
## 1 0e33fff6868e21d94d344c4bf2fcd81285b2c3e2 Arch 3 0
## 2 230d4e806215564296e126c9b532f1936adaaa89 Red 113 10
## removedComplexity cADD cDEL cSIZE cINTROD cREMOVED ADD DEL REASON CLOC
## 1 0 582 879 1461 0 0 3 3 I 9443
## 2 0 135 34 169 0 0 113 32 F 9443
## COMPLEX DUP logcADD logcDEL logcSIZE logcREMOVED logADD logDEL
## 1 1244 25 6.368187 6.779922 7.287561 0 1.386294 1.386294
## 2 1244 25 4.912655 3.555348 5.135798 0 4.736198 3.496508
## logCOMPLEX logDUP INTROD REMOVED A R C scaleDUP
## 1 7.126891 3.258097 0 0 -0.4751701 -0.1458825 2.537155 0.5413155
## 2 7.126891 3.258097 0 0 1.6152839 1.2642792 2.537155 0.5413155
## D cA cR cSZ cREM cREMlog
## 1 2.058662 0.67129576 1.044691 0.6849683 -0.2146616 -0.455959
## 2 2.058662 0.03418981 -0.252292 -0.3619011 -0.2146616 -0.455959
# Conditional effect of added lines by complexity for two teams on Jupiter.
teamBlue <- condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter")
teamArch <- condeffect_logADD_by_logCOMPLEX(m, d, "Arch", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamBlue)
# Which (unscaled) complexity levels the conditional-effect grid actually used.
teamBlue |> mutate(added=unscale_added(A), complexity=factor(trunc(unscale_complexity(as.integer(C))))) |> select(complexity) |> summary()
## complexity
## 14 :6000
## 489:4000
# Spot-check: sample rows and back-transform the scaled complexity C.
d |> sample_n(10) |> select(y, A, C, team, repo) |> mutate(foo = trunc(unscale_complexity(round(C))))
## y A C team repo foo
## 1 0 -0.2221461 0.55867729 Brown Saturn 85
## 2 0 0.5291739 -1.16528154 Blue Saturn 2
## 3 0 -0.9077175 -0.76639503 Green Jupiter 2
## 4 0 0.2603507 -1.56416806 Red Saturn -1
## 5 0 -1.3402650 1.15630936 Pink Saturn 85
## 6 0 -0.1259509 1.85875713 Green IntTest 489
## 7 0 1.3109406 -0.76639503 Green Mars 2
## 8 0 0.5886506 -0.04546552 Yellow Uranus 14
## 9 0 1.6472927 0.97177370 Arch IntTest 85
## 10 0 -0.6546935 0.41199404 Yellow IntTest 14
#mutate(truncC=factor(trunc(unscale_complexity(round(C, 0)))), added=unscale_added(A), complexity=factor(trunc(unscale_complexity(round(C, 0))))) |> select(complexity)
plot_logADD_by_logCOMPLEX(m, d, teamArch)
teamBrown <- condeffect_logADD_by_logCOMPLEX(m, d, "Brown", "IntTest")
plot_logADD_by_logCOMPLEX(m, d, teamBrown)
teamUI <- condeffect_logADD_by_logCOMPLEX(m, d, "UI", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamUI)
(p <- mcmc_areas(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))
Not surprisingly, the population-level betas are similar to those of \(\mathcal{M}_4\).
# Posterior densities of the group-level standard deviations.
mcmc_areas(m, regex_pars = c("^sd_"))
# Team-level random intercepts.
mcmc_areas(m, regex_pars = c("^r_team[[].*,Intercept"))
# All group-level effects for the Red team, and for Red in the IntTest repo.
# ([[ ]] inside the regex matches literal square brackets in parameter names.)
mcmc_areas(m, regex_pars = c("^r_team.*[[]Red,.*[]]"))
mcmc_areas(m, regex_pars = c("^r_team:repo.*[[].*Red_IntTest,.*[]]"))
# Conditional effect of (log) complexity by (log) added lines for several
# team/repo combinations; x-axis is back-transformed via log1p.
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "IntTest", robust=T)) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Neptune")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Uranus")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "Jupiter")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
# Conditional effect of added lines by complexity under a fixed scenario
# (removed=200, duplicates=20) for different team/repo combinations.
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Neptune", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
# Publication-style versions for Jupiter at the 95th-percentile scenario
# (removed and duplicates at q95 of the observed data); note the per-team
# y-axis limits chosen to fit each plot.
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,215))
)
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))+ scale_y_continuous(limits=c(0,175))
# Conditional effect of added lines by pre-existing duplicates, holding
# complexity at q95 of the observed data.
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Jupiter", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
# Repo-specific quantiles for Jupiter, used below instead of global quantiles.
jupiter_stats <- data |> filter(repo == "Jupiter") |> summarize(remQ95 = q95(DEL), compQ95 = q95(COMPLEX), remQ99 = q99(DEL), compQ99 = q99(COMPLEX))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Jupiter", removed=jupiter_stats$remQ95, complexity = jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "IntTest", removed=jupiter_stats$remQ95, complexity=jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + scale_y_continuous(limits = c(0,400))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "IntTest", removed=q95(data$DEL), complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
# Default-scenario versions, including the synthetic "Newbies"/"NewTeam"
# teams (teams unseen during training, to probe the group-level priors).
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "NewTeam", "Saturn")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
# NOTE(review): the next call passes removed=q95(data$DUP) where the others
# use q95(data$DEL) — possibly intentional, but verify it is not a typo.
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Saturn", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Saturn", removed=q95(data$DUP), duplicates = q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Orange", "Venus", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Yellow", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
All the conditional effects plots are virtually the same for this model and \(\mathcal{M}_4\). This is natural, given that the posterior population-level predictors have very similar distributions.
source("predict_utils.R")
A change of Q95 (95th-percentile) size in a highly complex file.
# Cumulative probability of the number of introduced duplicates for five
# teams across four repos, with all predictors at their 95th percentiles.
# NOTE(review): this call names the argument `complex=` while later calls use
# `complexity=` — presumably relying on R's partial argument matching; verify
# against predict_for_team() and consider spelling it out consistently.
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Jupiter", "IntTest", "Aniara", "Venus"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.25,1.0)) + scale_x_continuous(limits = c(0,30)) )
# The warnings below are expected: the axis limits clip part of the ECDF.
## Warning: Removed 4714 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
#' Per-repository boxplots of the posterior probability that a change by
#' `aTeam` introduces at least one duplicate, P(INTROD > 0).
#'
#' @param predictions Data frame from predict_for_team()/onepred() with
#'   columns team, repo, pred0 (a percentage), plus the scenario columns
#'   added, removed, complexity and duplicates.
#' @param aTeam Name of the team to plot.
#' @return A ggplot object.
duplicates_probability_for_team <- function(predictions, aTeam) {
  params <- predictions |>
    select(added, removed, complexity, duplicates) |>
    distinct()
  # All rows must share a single prediction scenario; otherwise the subtitle
  # (and the plot itself) would silently mix incomparable settings.
  stopifnot(nrow(params) == 1)
  predictions |>
    filter(team == aTeam) |>
    ggplot(aes(x=repo, y=pred0/100, color=team)) +
    geom_boxplot() +
    scale_color_manual(values=COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of any duplicate per repository for team", aTeam),
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates, 0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    ylab("P(INTROD > 0)") +
    scale_y_continuous(limits=c(0,1))
}
#' Per-repository boxplots of the posterior probability that a change by
#' `aTeam` introduces more than five duplicates, P(INTROD > 5).
#'
#' @param predictions Data frame from predict_for_team()/onepred() with
#'   columns team, repo, pred5 (a percentage), plus the scenario columns
#'   added, removed, complexity and duplicates.
#' @param aTeam Name of the team to plot.
#' @return A ggplot object.
more_five_probability_for_team <- function(predictions, aTeam) {
  params <- predictions |>
    select(added, removed, complexity, duplicates) |>
    distinct()
  # All rows must share a single prediction scenario; otherwise the subtitle
  # (and the plot itself) would silently mix incomparable settings.
  stopifnot(nrow(params) == 1)
  predictions |>
    filter(team == aTeam) |>
    ggplot(aes(x=repo, y=pred5/100, color=team)) +
    geom_boxplot() +
    scale_color_manual(values=COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of more than five duplicates per repository for team", aTeam),
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates,0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    ylab("P(INTROD > 5)") +
    scale_y_continuous(limits=c(0,1))
}
# Single prediction scenario with all predictors at their 95th percentiles,
# then the two probability views for the Blue team.
pred <- onepred(added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))
(p <- duplicates_probability_for_team(pred, "Blue"))
(p <- more_five_probability_for_team(pred, "Blue"))
The observed number of introduced duplicates, for comparison with the model's predictions above.
# Observed data: duplicates vs. complexity, point size/color by number of
# introduced duplicates, for one team in one repo.
# NOTE(review): MARS and GREEN are bare names — presumably string constants
# (e.g. "Mars", "Green") defined in ingest_data.R; verify they exist.
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> ggplot(aes(x=DUP, y=COMPLEX, size=INTROD, color=INTROD)) + geom_point() + geom_jitter() + scale_y_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100, 250, 500, 750), trans="log1p") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_color_distiller(palette = "Spectral") + theme_bw()
)
# Per-team maxima of introduced duplicates in the Mars repo (ties kept, hence
# repeated team names in the output below).
data |> filter(repo == MARS) |> group_by(committerteam) |> filter(INTROD == max(INTROD)) |> select(committerteam, INTROD, DUP) |> arrange(desc(INTROD))
## # A tibble: 28 × 3
## # Groups: committerteam [10]
## committerteam INTROD DUP
## <fct> <int> <int>
## 1 Brown 24 86
## 2 Blue 14 72
## 3 Green 10 0
## 4 Violet 5 0
## 5 Yellow 2 1
## 6 Yellow 2 1
## 7 Red 2 0
## 8 Yellow 2 0
## 9 Orange 1 1
## 10 Orange 1 1
## # ℹ 18 more rows
# Distribution of pre-existing duplicates, stacked by introduced duplicates.
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> group_by(INTROD) |> ggplot(aes(x=DUP, group=INTROD, fill=INTROD)) + geom_bar(position="stack") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_fill_distiller(palette = "Spectral") + theme_bw()
)
# Cumulative probability of introduced duplicates at the q95 scenario across
# four repos (warnings are expected axis-limit clipping).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest", "Mars"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.50,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 3072 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Same view across all repos, and with median duplicates/complexity.
allrepos <- c("Aniara", "Jupiter", "IntTest", "Mercury", "Venus", "Mars", "Saturn", "Neptune", "Uranus")
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Brown"), repo=allrepos, added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.20,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 10264 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=median(data$DUP), complex=median(data$COMPLEX))) + scale_y_continuous(limits = c(0.70,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Hand-picked scenario: 20 pre-existing duplicates in a file of complexity 170.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=20, complex=170)) + scale_y_continuous(limits = c(0.35,1)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 1176 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Reference tables: 95th- and 99th-percentile values of the raw predictors,
# globally and per repository.
data |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## q95(ADD) q95(DEL) q95(COMPLEX) q95(DUP)
## 1 143 92 282 36
data |> group_by(repo) |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 8 × 5
## repo `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 IntTest 216 127. 558 110
## 2 Jupiter 120 74.4 177 19
## 3 Mars 146. 114 154. 5
## 4 Mercury 144 106 282 6
## 5 Neptune 231 156 180 9
## 6 Saturn 101. 72.6 320 9
## 7 Uranus 158. 95.9 89.9 4
## 8 Venus 154 84.5 132 4
data |> group_by(repo) |> summarize(q99(ADD), q99(DEL), q99(COMPLEX), q99(DUP))
## # A tibble: 8 × 5
## repo `q99(ADD)` `q99(DEL)` `q99(COMPLEX)` `q99(DUP)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 IntTest 638. 511. 633 620.
## 2 Jupiter 298. 246. 1056 32
## 3 Mars 316. 557. 661 72
## 4 Mercury 304. 283. 374. 16.9
## 5 Neptune 511 474 216 28
## 6 Saturn 239 209. 758 18
## 7 Uranus 291. 273. 126. 9
## 8 Venus 433 374. 260 11
It may seem strange that teams in the Neptune repo look very similar for this change, but the Neptune repo is mostly static, with no team owning it — mostly abandoned code.
# Further cumulative-probability scenarios. Where complexity/duplicates are
# omitted, predict_for_team() presumably falls back to its defaults — verify
# in predict_utils.R. The warnings after each plot are expected axis-limit
# clipping from the scale_*_continuous limits.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "Venus"), added=q95(data$ADD), removed=q95(data$DEL))) + scale_y_continuous(limits = c(0.6,1)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 344 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Mean-sized change.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "IntTest"), added=mean(data$ADD), removed=mean(data$DEL))) + scale_y_continuous(limits = c(0.2,1.02)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# q95 change size with mean complexity/duplicates...
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates=mean(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 327 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# ...versus q95 complexity/duplicates, for several team subsets.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.3,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2081 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2427 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 6215 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).
# q99 change size with median complexity/duplicates, all eight teams.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q99(data$ADD), removed=q99(data$DEL), complexity = median(data$COMPLEX), duplicates=median(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2585 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).